Overview: In this project, we will be looking at the player data provided by FIFA which contains information such as personal details, wages, physical attributes, technical skills, potential and their positional strengths. This is primarily data of FIFA 2018.Through this project, you will get a glimpse of insights behind the beautiful game and the kind of information and decisions a football manager goes through.
Questions: Explore the data and attempt all the below asked questions in a step by step manner:
import warnings
warnings.filterwarnings('ignore')
Import libraries for EDA Project
import pandas as pd
pd.set_option('max_columns', None)
import seaborn as sns
import re
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
Load Dataset
# Read data file in excel
df = pd.read_excel("FIFA.xlsx")
print (df)
df.head()
# Display important fields for EDA
df_new = df[['Name', 'Age', 'Nationality', 'Overall', 'Potential', 'Club', 'Value','Wage', 'Preferred Positions']]
df_new.head(10)
# Prepare a rank ordered list of top 10 countries with most players.
List_of_Top10 = df.groupby('Nationality').size().sort_values(ascending=False).head(10)
print(List_of_Top10)
# List 5 countries that are producing the most numbers of footballers that play at this level
List_of_Top5 = df.groupby('Nationality').size().sort_values(ascending=False).head()
print(List_of_Top5)
plt.figure(figsize=(15,32))
sns.countplot(y = df.Nationality,palette="Set2")
Summary: Most players are from England,Germany,Spain,France,Argentina
# Check for null in Age column
df['Age'].isnull().sum()
# Visualize age of players in the dataset
plt.figure(figsize=(15,6))
sns.countplot(x="Age",data=df)
age_perf = df['Age']
plt.hist(age_perf, 20, facecolor='green')
plt.xlabel("Age_Years ")
plt.ylabel("Overall")
plt.title("Performance_With_Age")
plt.grid(True)
plt.show()
Summary: According to the dataset available ,performance of players stops improving after age of 33
# Which type of offensive players tends to get paid the most: the striker, the right-winger, or the left-winger?
# Visualize through a scatter plot for all the three
# Use important fields only for EDA
df_new.head(10)
# Check for unique Value
df_new.Value.unique()
# Make a copy of data
footballers = df_new.copy()
footballers['Unit'] = df_new['Value'].str[-1]
footballers['Value_In_M'] = np.where(footballers['Unit'] == '0', 0,
footballers['Value'].str[1:-1].replace(r'[a-zA-Z]',''))
#Check of unique values for newly added Unit column
footballers.Unit.unique()
#Check for values for newly added Unit column and delete them
footballers[footballers['Unit'].isnull()]
footballers.dropna(inplace=True)
#Check of unique values after Null removal for Unit
footballers.Unit.unique()
# Check for unique values for newly added Value_In_M column
footballers.Value_In_M.unique()
#select only non-blank
footballers = footballers[footballers['Value_In_M'] != '']
footballers.Value_In_M.unique()
footballers.head()
footballers['Value_In_M'] = footballers['Value_In_M'].astype(float)
footballers['Value_In_M'] = np.where(footballers['Unit'] == 'M',
footballers['Value_In_M'],
footballers['Value_In_M']/1000)
footballers = footballers.assign(Value=footballers['Value_In_M'],
Position=footballers['Preferred Positions'].str.split().str[0])
sns.lmplot(x='Value', y='Overall', hue='Position',
data=footballers.loc[footballers['Position'].isin(['ST', 'RW', 'LW'])],
fit_reg=False,scatter=True)
Summary : The players in strikers position (ST) gets paid the most
#Top 5 players for every preferred position in terms of overall as well as potential points. Who were the best in 2018?
top_pref_df = df[['Name', 'Age', 'Nationality', 'Overall', 'Potential', 'Club', 'Value', 'Preferred Positions']]
top_pref_df.head(10)
pref_columns = [
'Name',
'Age',
'Nationality',
'Overall',
'Potential',
'Club',
'Value',
'Wage',
'Preferred Positions'
]
top_players = pd.DataFrame(top_pref_df, columns=pref_columns)
top_players.head()
top_players['Position'] = top_players['Preferred Positions'].apply(lambda x: x[:3])
top_players.head()
Summary: Top 5 players for each position are as follows :-
Cristiano Ronaldo - ST
L. Messi - RW
Neymar - LW
L. Suárez - ST
M. Neuer - GK
#Which club(s) have the maximum share of players from England?
#Which club(s) have the maximum share of players from Spain?
#Which club(s) have the maximum share of players from Germany?
custom_columns = [
'Name',
'Nationality',
'Club',
]
club_players = pd.DataFrame(df_new, columns=custom_columns)
club_players.head()
club_players['Nationality'].value_counts()
club_dat = club_players.groupby('Club')['Club'].agg(['count']).sort_values(['count'], ascending=False)
club_dat.head()
#England_Players_In_Club = club_players['Nationality']='England'
#print(England_Players_In_Club)
England_footballers = club_players[club_players['Nationality'] == 'England']
England_footballers.Nationality.unique()
England_footballers.head()
England_footballers.count()
Eng_dat = England_footballers.groupby('Club')['Club'].agg(['count']).sort_values(['count'], ascending=False)
Eng_dat.head()
England_footballers.groupby("Club").Name.count().sort_values(ascending=False).head(5).plot(kind="pie");
Summary : The following Clubs have max share of players from England - Bolton Wanderers,Shrewsbury,Chesterfield,Colchester United,Burton Albion
Spain_footballers = club_players[club_players['Nationality'] == 'Spain']
Spain_footballers.Nationality.unique()
Spain_footballers.head()
Spain_footballers.count()
Spain_dat = Spain_footballers.groupby('Club')['Club'].agg(['count']).sort_values(['count'], ascending=False)
Spain_dat.head()
Spain_footballers.groupby("Club").Name.count().sort_values(ascending=False).head(5).plot(kind="pie");
Summary : The following Clubs have max share of players from Spain - CA Osasuna, Athletic Club de Bilbao,UD Las Palmas,Lorca Deportiva CF,Real Valladolid
German_footballers = club_players[club_players['Nationality'] == 'Germany']
German_footballers.Nationality.unique()
German_footballers.head()
German_footballers.count()
German_dat = German_footballers.groupby('Club')['Club'].agg(['count']).sort_values(['count'], ascending=False)
German_dat.head()
Summary : The following Clubs have max share of players from Germany - Holstein Kiel,SSV Jahn Regensburg,SV Meppen,FSV Zwickau,1. FC Magdeburg
#As a National coach of France team you want to compare the national team of England, Spain, Italy and Germany
#to understand the competition. The formation of the teams is
#restricted to 4-3-3 (4 defenders, 3 midfielders, 3 forwards, 1 Goal Keeper)
#and players with overall value of more than 75 are preferred,
#now form the best team for each of the mentioned countries and compare them.
#Note down all the insights that you as a business analyst should share with the coach.
df['Remaining Potential'] = df['Potential'] - df['Overall']
df['Preferred Position'] = df['Preferred Positions'].str.split().str[0]
def best_squad(position):
df_copy = df.copy()
store = []
for i in position:
store.append([i,df_copy.loc[[df_copy[df_copy['Preferred Position'] == i]['Overall'].idxmax()]]['Name'].to_string(index = False), df_copy[df_copy['Preferred Position'] == i]['Overall'].max()])
df_copy.drop(df_copy[df_copy['Preferred Position'] == i]['Overall'].idxmax(), inplace = True)
return pd.DataFrame(np.array(store).reshape(11,3), columns = ['Position', 'Player', 'Overall']).to_string(index = False)
# 4-3-3
squad_433 = ['GK', 'LB', 'CB', 'CB', 'RB', 'LM', 'CDM', 'RM', 'LW', 'ST', 'RW']
print ('4-3-3')
print (best_squad(squad_433))
BEST SQUAD based on the overall value of all players
#Team England , filter for Nationality equals England (1630 players)
ENG_top_players = top_players[top_players['Nationality'] == 'England']
ENG_top_players
ENG_top_players['Remaining Potential'] = ENG_top_players['Potential'] - ENG_top_players['Overall']
ENG_top_players['Preferred Position'] = ENG_top_players['Preferred Positions'].str.split().str[0]
def best_eng_squad(position):
df_eng_copy = ENG_top_players.copy()
store_eng = []
for i in position:
store_eng.append([i,df_eng_copy.loc[[df_eng_copy[df_eng_copy['Preferred Position'] == i]['Overall'].idxmax()]]['Name'].to_string(index = False), df_eng_copy[df_eng_copy['Preferred Position'] == i]['Overall'].max()])
df_eng_copy.drop(df_eng_copy[df_eng_copy['Preferred Position'] == i]['Overall'].idxmax(), inplace = True)
return pd.DataFrame(np.array(store_eng).reshape(11,3), columns = ['Position', 'Player', 'Overall']).to_string(index = False)
# 4-3-3
squad_433 = ['GK', 'LB', 'CB', 'CB', 'RB', 'LM', 'CDM', 'RM', 'LW', 'ST', 'RW']
print ('4-3-3')
print (best_eng_squad(squad_433))
BEST England Squad
#Team Spain , filter for Nationality equals Spain (1019 players)
SPAIN_top_players = top_players[top_players['Nationality'] == 'Spain']
SPAIN_top_players
SPAIN_top_players['Remaining Potential'] = SPAIN_top_players['Potential'] - SPAIN_top_players['Overall']
SPAIN_top_players['Preferred Position'] = SPAIN_top_players['Preferred Positions'].str.split().str[0]
def best_spain_squad(position):
df_spain_copy = SPAIN_top_players.copy()
store_spain = []
for i in position:
store_spain.append([i,df_spain_copy.loc[[df_spain_copy[df_spain_copy['Preferred Position'] == i]['Overall'].idxmax()]]['Name'].to_string(index = False), df_spain_copy[df_spain_copy['Preferred Position'] == i]['Overall'].max()])
df_spain_copy.drop(df_spain_copy[df_spain_copy['Preferred Position'] == i]['Overall'].idxmax(), inplace = True)
return pd.DataFrame(np.array(store_spain).reshape(11,3), columns = ['Position', 'Player', 'Overall']).to_string(index = False)
# 4-3-3
squad_433 = ['GK', 'LB', 'CB', 'CB', 'RB', 'LM', 'CDM', 'RM', 'LW', 'ST', 'RW']
print ('4-3-3')
print (best_spain_squad(squad_433))
BEST SPAIN Squad
#Team Italy , filter for Nationality equals Italy (799 players)
ITALY_top_players = top_players[top_players['Nationality'] == 'Italy']
ITALY_top_players
ITALY_top_players['Remaining Potential'] = ITALY_top_players['Potential'] - ITALY_top_players['Overall']
ITALY_top_players['Preferred Position'] = ITALY_top_players['Preferred Positions'].str.split().str[0]
def best_italy_squad(position):
df_italy_copy = ITALY_top_players.copy()
store_italy = []
for i in position:
store_italy.append([i,df_italy_copy.loc[[df_italy_copy[df_italy_copy['Preferred Position'] == i]['Overall'].idxmax()]]['Name'].to_string(index = False), df_italy_copy[df_italy_copy['Preferred Position'] == i]['Overall'].max()])
df_italy_copy.drop(df_italy_copy[df_italy_copy['Preferred Position'] == i]['Overall'].idxmax(), inplace = True)
return pd.DataFrame(np.array(store_italy).reshape(11,3), columns = ['Position', 'Player', 'Overall']).to_string(index = False)
# 4-3-3
squad_433 = ['GK', 'LB', 'CB', 'CB', 'RB', 'LM', 'CDM', 'RM', 'LW', 'ST', 'RW']
print ('4-3-3')
print (best_italy_squad(squad_433))
BEST ITALY SQUAD
#Team Germany , filter for Nationality equals Germany (1140 players)
Germany_top_players = top_players[top_players['Nationality'] == 'Germany']
Germany_top_players
Germany_top_players['Remaining Potential'] = Germany_top_players['Potential'] - Germany_top_players['Overall']
Germany_top_players['Preferred Position'] = Germany_top_players['Preferred Positions'].str.split().str[0]
def best_Germany_squad(position):
df_Germany_copy = Germany_top_players.copy()
store_Germany = []
for i in position:
store_Germany.append([i,df_Germany_copy.loc[[df_Germany_copy[df_Germany_copy['Preferred Position'] == i]['Overall'].idxmax()]]['Name'].to_string(index = False), df_Germany_copy[df_Germany_copy['Preferred Position'] == i]['Overall'].max()])
df_Germany_copy.drop(df_Germany_copy[df_Germany_copy['Preferred Position'] == i]['Overall'].idxmax(), inplace = True)
return pd.DataFrame(np.array(store_Germany).reshape(11,3), columns = ['Position', 'Player', 'Overall']).to_string(index = False)
# 4-3-3
squad_433 = ['GK', 'LB', 'CB', 'CB', 'RB', 'LM', 'CDM', 'RM', 'LW', 'ST', 'RW']
print ('4-3-3')
print (best_Germany_squad(squad_433))
BEST GERMANY SQUAD
#Team France , filter for Nationality equals France (978 players)
France_top_players = top_players[top_players['Nationality'] == 'France']
France_top_players
France_top_players['Remaining Potential'] = France_top_players['Potential'] - France_top_players['Overall']
France_top_players['Preferred Position'] = France_top_players['Preferred Positions'].str.split().str[0]
def best_France_squad(position):
df_France_copy = France_top_players.copy()
store_France = []
for i in position:
store_France.append([i,df_France_copy.loc[[df_France_copy[df_France_copy['Preferred Position'] == i]['Overall'].idxmax()]]['Name'].to_string(index = False), df_France_copy[df_France_copy['Preferred Position'] == i]['Overall'].max()])
df_France_copy.drop(df_France_copy[df_France_copy['Preferred Position'] == i]['Overall'].idxmax(), inplace = True)
return pd.DataFrame(np.array(store_France).reshape(11,3), columns = ['Position', 'Player', 'Overall']).to_string(index = False)
# 4-3-3
squad_433 = ['GK', 'LB', 'CB', 'CB', 'RB', 'LM', 'CDM', 'RM', 'LW', 'ST', 'RW']
print ('4-3-3')
print (best_France_squad(squad_433))
BEST FRANCE SQUAD
Final Comparison of Teams - England, Spain, Italy and Germany to find competion
import pandas_profiling
ENG_top_players.profile_report()
SPAIN_top_players.profile_report()
ITALY_top_players.profile_report()
Germany_top_players.profile_report()
France_top_players.profile_report()